{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 处理文本有两种方式：字符级别和单词级别\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 字符级别\n",
    "\n",
    "\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('../data/p1ch4/jane-austen/1342-0.txt',encoding='utf8') as f:\n",
    "    text = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# onhot编码字符\n",
    "# 首先将文本分为一行一行的\n",
    "lines = text.split('\\n')\n",
    "line = lines[200]    # 取第200行\n",
    "line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([70, 128])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 创建一个张量，来存放整行字符  （使用ASCLL）\n",
    "\n",
    "letter_t = torch.zeros(len(line),128)  # 字符数量为行，128个编码为列\n",
    "letter_t.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 找到每个字符对应的编码的索引\n",
    "for i ,letter in enumerate(line.lower().strip()):\n",
    "    letter_index = ord(letter) if ord(letter)<128 else 0   # 依次找到每个字符在ASCLL码的索引\n",
    "    letter_t[i][letter_index] = 1          # 将onehot中每一行（对应每个字符）中字符对应的编码的索引位置设为1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[1., 0., 0.,  ..., 0., 0., 0.],\n",
       "        [0., 0., 0.,  ..., 0., 0., 0.],\n",
       "        [0., 0., 0.,  ..., 0., 0., 0.],\n",
       "        ...,\n",
       "        [0., 0., 0.,  ..., 0., 0., 0.],\n",
       "        [0., 0., 0.,  ..., 0., 0., 0.],\n",
       "        [0., 0., 0.,  ..., 0., 0., 0.]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "letter_t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',\n",
       " ['impossible',\n",
       "  'mr',\n",
       "  'bennet',\n",
       "  'impossible',\n",
       "  'when',\n",
       "  'i',\n",
       "  'am',\n",
       "  'not',\n",
       "  'acquainted',\n",
       "  'with',\n",
       "  'him'])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 单词级别的onehot编码，应该建立一个词汇表\n",
    "\n",
    "# 定义一个函数，输入文本，输出将其转为小写字母并去掉标点后输出\n",
    "\n",
    "def clean_words(input_str):\n",
    "    punctuation = '.,;:\"!?”“_-'\n",
    "    word_list = input_str.lower().replace('\\n',' ').split()         # 将每个单词分开\n",
    "    word_list = [word.strip(punctuation) for word in word_list]     # 去掉标点\n",
    "    return word_list\n",
    "words_in_line = clean_words(line) # 输入一行\n",
    "line,words_in_line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7261, 3394)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_list = sorted(set(clean_words(text)))   # 把整个文本——clean_words 处理后放到集合里并排序（集合没有重复项）、\n",
    "word2index_dict = {word: i for (i,word) in enumerate(word_list)} #word 和 它的索引作为键值对放到字典里\n",
    "\n",
    "len(word2index_dict),word2index_dict['impossible']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 0 3394  impossible\n",
      " 1 4305  mr\n",
      " 2  813  bennet\n",
      " 3 3394  impossible\n",
      " 4 7078  when\n",
      " 5 3315  i\n",
      " 6  415  am\n",
      " 7 4436  not\n",
      " 8  239  acquainted\n",
      " 9 7148  with\n",
      "10 3215  him\n"
     ]
    }
   ],
   "source": [
    "# 对第200行的句子进行onehot编码\n",
    "word_t = torch.zeros(len(words_in_line),len(word2index_dict))\n",
    "for i,word in enumerate(words_in_line):\n",
    "    word_index = word2index_dict[word]\n",
    "    word_t[i][word_index]=1             # 将相应的位置设为1\n",
    "    print('{:2} {:4}  {}'.format(i,word_index,word))   #打印单词在句子中的索引，在词库中的索引，单词本身"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
